{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read in the data from the CSV file\n",
    "df = pd.read_csv('../chapter2/datasets/payment_fraud.csv')\n",
    "\n",
    "# Convert categorical feature into dummy variables with one-hot encoding\n",
    "df = pd.get_dummies(df, columns=['paymentMethod'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split dataset up into train and test sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    df.drop('label', axis=1), df['label'],\n",
    "    test_size=0.33, random_state=17)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize and train classifier model\n",
    "clf = LogisticRegression().fit(X_train, y_train)\n",
    "\n",
    "# Make predictions on test set\n",
    "y_pred = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.99992273816\n",
      "[[12753     0]\n",
      " [    1   189]]\n"
     ]
    }
   ],
   "source": [
    "# Compare test set predictions with ground truth labels\n",
    "print(accuracy_score(y_pred, y_test))\n",
    "print(confusion_matrix(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Inspecting the trained Logistic Regression model coefficients & intercept"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-7.44949492  0.26692309  1.39595031 -1.44011704  1.41274547  1.32026309\n",
      "   0.20373255]]\n"
     ]
    }
   ],
   "source": [
    "print(clf.coef_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 2.93674111]\n"
     ]
    }
   ],
   "source": [
    "print(clf.intercept_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[19]\n"
     ]
    }
   ],
   "source": [
    "print(clf.n_iter_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Let's go under the hood and implement Logistic Regression ourselves"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic function, also known as the sigmoid function\n",
    "def logistic(x):\n",
    "    return 1 / (1 + np.exp(-x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic regression cost function\n",
    "def cost(theta, X, y):\n",
    "    X = X.values\n",
    "    y = y.values\n",
    "    \n",
    "    # Note that we clip the minimum values to slightly above\n",
    "    # zero to avoid throwing an error when logarithm is applied\n",
    "    log_prob_zero = np.log(\n",
    "        (1 - logistic(np.dot(X, theta))).clip(min=1e-10))\n",
    "    log_prob_one = np.log(\n",
    "        logistic(np.dot(X, theta)).clip(min=1e-10))\n",
    "\n",
    "    # Calculating the log-likelihood terms\n",
    "    zero_likelihood = (1 - y) * log_prob_zero\n",
    "    one_likelihood = -y * log_prob_one\n",
    "    \n",
    "    # Summation across all the samples, then taking the mean\n",
    "    return np.sum(one_likelihood - zero_likelihood) / (len(X))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic regression gradient function\n",
    "def gradient(theta, X, y):\n",
    "    X = X.values\n",
    "    y = y.values\n",
    "\n",
    "    num_params = theta.shape[0]\n",
    "    grad = np.zeros(num_params)\n",
    "    err = logistic(np.dot(X, theta)) - y\n",
    "\n",
    "    # Iterating through parameters and calculating\n",
    "    # gradient for each given current error\n",
    "    for i in range(num_params):\n",
    "        term = np.multiply(err, X[:, i])\n",
    "        grad[i] = np.sum(term) / len(X)\n",
    "    \n",
    "    return grad"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Insert column of zeros for more convenient matrix multiplication\n",
    "X_train.insert(0, 'ones', 1)\n",
    "X_test.insert(0, 'ones', 1)\n",
    "\n",
    "# Seed for reproducibility\n",
    "np.random.seed(17)\n",
    "theta = np.random.rand(8)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the value of the cost function before optimization (training the Logistic Regression model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20.38085906649756"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cost(theta, X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Through optimization, the parameters of the Logistic Regression model (coefficients and intercept) will be adjusted such that the value of the cost function is minimized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/site-packages/ipykernel_launcher.py:3: RuntimeWarning: overflow encountered in exp\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "from scipy.optimize import fmin_tnc\n",
    "\n",
    "res = fmin_tnc(func=cost, x0=theta, fprime=gradient, \n",
    "               args=(X_train, y_train))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the optimized combination of coefficients and intercept that result in a minimized cost function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([ 17.9286163 , -26.67080469,   0.58380376,   2.46179901,\n",
       "         -5.67978642,  10.65851254,  11.48534156,   4.5302039 ]), 38, 0)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After optimization, the cost went down from 20.38 to 3.29"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cost(res[0], X_train, y_train)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
